Data Exploration
import pandas as pd
# Use row 5 (zero-based; pandas does not count blank lines by default) as the
# column header. Everything above that row is skipped.
data = pd.read_csv("./Marker_records_30s.csv", header=5)
# Notebook-style inspection: these bare expressions only display their value
# when they are the last statement of a notebook cell.
data.shape
data.head()
data.describe()
# Number of missing values per column.
data.isnull().sum()
import numpy as np
def getMarkersPosition(lst):
    """Collect frame number, time stamp and marker coordinates from one row.

    Only the first row of ``lst`` is read. Columns are classified by a
    substring match on their names: ``'Frame'`` and ``'Time'`` fill the scalar
    fields, while names containing ``'X'``, ``'Y'`` or ``'Z'`` contribute to
    the matching coordinate list. NaN coordinates are skipped.

    Args:
        lst: A pandas DataFrame, typically a one-row slice of the capture
            table.

    Returns:
        A dict with keys ``'f'`` (frame), ``'t'`` (time), and ``'x'``, ``'y'``,
        ``'z'`` (lists of the non-NaN coordinate values, in column order).

    Raises:
        IndexError: If ``lst`` has columns but no rows.
    """
    pos = {
        'f': 0,
        't': 0.0,
        'x': [],
        'y': [],
        'z': [],
    }
    # Iterate the columns of the frame that was passed in, not of a
    # module-level table, so the function works on any DataFrame.
    for col in lst.columns:
        # Read the first-row value once; tolist() yields a plain Python scalar.
        value = lst[col].iloc[:1].tolist()[0]
        if 'Frame' in col:
            pos['f'] = value
        if 'Time' in col:
            pos['t'] = value
        # Independent checks (not elif): a name matching several axes is
        # recorded under each of them.
        for axis in ('X', 'Y', 'Z'):
            if axis in col and not np.isnan(value):
                pos[axis.lower()].append(value)
    return pos
import plotly.graph_objects as go
import plotly.io as pio
import plotly.offline as pltoff
# Select the Plotly renderer used by fig.show(). "notebook" is active; the
# commented-out line is the alternative "browser" renderer.
# pio.renderers.default = "browser"
pio.renderers.default = "notebook"
def plotMarkers(pos):
    """Show the markers of one frame as an interactive 3-D scatter plot.

    Args:
        pos: Mapping with keys ``'f'`` (frame), ``'t'`` (time) and ``'x'``,
            ``'y'``, ``'z'`` (equal-length coordinate lists), in the shape
            produced by ``getMarkersPosition``.

    Returns:
        None. The figure is displayed through ``fig.show()``.
    """
    title_txt = "{0} Markers Captured in {1} Frame at {2} second".format(
        len(pos['x']), pos['f'], pos['t'])
    fig = go.Figure(data=[go.Scatter3d(
        x=pos['x'],
        y=pos['y'],
        z=pos['z'],
        mode='markers',
        marker=dict(
            size=3,
            color='purple',
            opacity=0.8,
        ),
    )])
    # Axis titles of a 3-D trace belong to layout.scene; the top-level
    # xaxis_title / yaxis_title keys only describe 2-D cartesian axes.
    fig.update_layout(
        title=title_txt,
        scene=dict(
            xaxis_title='X',
            yaxis_title='Y',
            zaxis_title='Z',
        ),
    )
    fig.show()
import random
# Plot three randomly chosen frames.
# randrange excludes its upper bound, so rd is always a valid row position.
# randint(1, n) could return n, which gives an empty slice and makes
# getMarkersPosition fail. The lower bound of 1 matches the other loops in
# this file, which also start at row 1.
for _ in range(3):
    rd = random.randrange(1, data.shape[0])
    f = getMarkersPosition(data[rd: rd+1])
    plotMarkers(f)
# Count, for every row from index 1 onwards, how many markers have a valid
# (non-NaN) X coordinate.
Marker_num_collection = []
# Frame numbers at which a progress line is printed (0, 100, ..., 4500).
# Built once instead of on every iteration.
progress_frames = {step * 100 for step in range(46)}
# NOTE(review): the range starts at 1, so row 0 is never counted - confirm
# that this is intended.
for i in range(1, data.shape[0]):
    pos = getMarkersPosition(data[i: i+1])
    Marker_num = len(pos['x'])
    f_num = pos['f']
    Marker_num_collection.append(Marker_num)
    if f_num in progress_frames:
        # "{:.2%}" scales the fraction by 100 and appends the percent sign.
        # The previous "{:.2}%" printed the raw fraction followed by "%".
        print("Processed {:.2%}".format(f_num/data.shape[0]))
from collections import Counter
# Tally how many frames were captured with each marker count.
c = Counter(Marker_num_collection)
ordered_counts = sorted(c.items())
for k, v in ordered_counts:
    print(k, v)
# Bare expression: shown as cell output when run in a notebook.
sorted(c.items())
# Pie-chart inputs: one label and one frame total per marker count.
k = [str(marker_count) + "_Markers" for marker_count, _ in ordered_counts]
v = [frame_total for _, frame_total in ordered_counts]
pio.renderers.default = "notebook"
fig = go.Figure(data=[go.Pie(labels=k, values=v)])
fig.update_layout(title='Frame w/ Different Markers %')
fig.show()
\begin{equation}
S_{2410 \times 48} =
\begin{bmatrix}
x_{1}^{1} & x_{2}^{1} & \cdots & x_{16}^{1} & y_{1}^{1} & y_{2}^{1} & \cdots & y_{16}^{1} & z_{1}^{1} & z_{2}^{1} & \cdots & z_{16}^{1} \\
x_{1}^{2} & x_{2}^{2} & \cdots & x_{16}^{2} & y_{1}^{2} & y_{2}^{2} & \cdots & y_{16}^{2} & z_{1}^{2} & z_{2}^{2} & \cdots & z_{16}^{2} \\
\vdots & \vdots & \ddots & \vdots & \vdots & \vdots & \ddots & \vdots & \vdots & \vdots & \ddots & \vdots \\
x_{1}^{2410} & x_{2}^{2410} & \cdots & x_{16}^{2410} & y_{1}^{2410} & y_{2}^{2410} & \cdots & y_{16}^{2410} & z_{1}^{2410} & z_{2}^{2410} & \cdots & z_{16}^{2410}
\end{bmatrix}
\end{equation}
# Keep only the frames in which exactly 16 markers were captured and flatten
# each one into a single row: all x values, then all y values, then all z.
lst_16m = []
progress_frames = {100 * step for step in range(46)}
total_rows = data.shape[0]
# For a quick test run, shorten the range to range(1, 50).
for i in range(1, total_rows):
    pos = getMarkersPosition(data[i: i+1])
    f_num = pos['f']
    Marker_num = len(pos['x'])
    if Marker_num == 16:
        line = pos['x'] + pos['y'] + pos['z']
        lst_16m.append(line)
    if f_num in progress_frames:
        print("Processed {:.2%}".format(f_num / total_rows))
# Stack the kept frames into a 2-D array: one row per frame, with the x, y and
# z lists concatenated along the columns.
X = np.array(lst_16m)
X.shape
# Transpose so that rows are coordinates and columns are frames.
X = X.T
X.shape
from sklearn.decomposition import PCA
from sklearn import preprocessing
# Standardize along axis 1, i.e. each row of the transposed matrix separately.
X_scale = preprocessing.scale(X, axis = 1)
# Sanity check: the per-row standard deviation is expected to be ~1 after
# scaling (for rows that are not constant).
X_scale.std(axis = 1)
X_scale.shape
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns; sns.set()
# Fit a PCA with all components and plot the cumulative explained variance.
pca = PCA().fit(X_scale)
cumulative_variance = np.cumsum(pca.explained_variance_ratio_)
# Start the x axis at 1 so each point lines up with the number of components
# it includes; the default index would start at 0.
component_counts = np.arange(1, len(cumulative_variance) + 1)
plt.figure()
plt.plot(component_counts, cumulative_variance, 'o-', c='#663399', alpha=.5)
plt.xlabel('Number of Components')
# The plotted values are cumulative ratios in [0, 1], not per-component
# percentages.
plt.ylabel('Cumulative Explained Variance Ratio')
# Same title as the truncated plots of this curve further down.
plt.title('Test Dataset Explained Variance')
plt.show()
# Re-plot the cumulative explained variance truncated to the first
# 10, 20, 30 and 40 components.
n_lst = [i*10 for i in range(1, 5)]
# NOTE(review): count is not used anywhere in the visible code; it is kept in
# case a later cell reads it.
count = 0
# The cumulative sum does not depend on n, so compute it once.
cumulative_variance = np.cumsum(pca.explained_variance_ratio_)
for n in n_lst:
    shown = cumulative_variance[:n]
    plt.figure()
    # Start the x axis at 1 so each point lines up with the number of
    # components it includes.
    plt.plot(np.arange(1, len(shown) + 1), shown, 'o-', c='#663399', alpha=.5)
    plt.xlabel('Number of Components')
    # The plotted values are cumulative ratios in [0, 1].
    plt.ylabel('Cumulative Explained Variance Ratio')
    plt.title('Test Dataset Explained Variance')
    plt.show()
# Print the cumulative explained variance for the first 20 component counts.
cumulative_variance = np.cumsum(pca.explained_variance_ratio_)
for index, i in enumerate(cumulative_variance[:20], start=1):
    print("Dimension Size: {}, Variance: {}".format(index, i))
import seaborn as sns
# 2x2 grid of heatmaps: the scaled data, then its reconstruction from the
# first 1, 2 and 3 principal components.
sns.set()
fig, axes = plt.subplots(2, 2, figsize=(15, 10), gridspec_kw=dict(hspace=.4, wspace=.3))
# Despine the figure that was just created. Called without fig/ax, despine
# acts on the current figure, which did not exist yet at the old call site.
sns.despine(fig=fig, left=True)
title_settings = {'fontsize': 16}

# Plot Heatmap 1
ax = sns.heatmap(X_scale, cbar=False, ax=axes[0, 0])
ax.set_title('Original samples', **title_settings)

# Plot Heatmap 2, 3, and 4
# Only three grid cells remain, so only three component counts are listed.
# The previous six-element list was silently cut to its first three by zip.
n_components = [1, 2, 3]
plot_ind = [[0, 1], [1, 0], [1, 1]]
subtitles = ['Inversed samples with {} components'] * len(n_components)
for nc, title, (row, col) in zip(n_components, subtitles, plot_ind):
    # NOTE: this rebinds the module-level `pca` to the reduced model.
    pca = PCA(n_components=nc)
    PC = pca.fit_transform(X_scale)
    inversed = pca.inverse_transform(PC)
    ax = sns.heatmap(inversed, cbar=False, ax=axes[row, col])
    ax.set_title(title.format(nc), **title_settings)